<!doctype html>
<!-- The doctype selects standards mode; lang tells assistive technology the page is Simplified Chinese. -->
<html lang="zh-Hans">
 <head>
  <meta charset="utf-8"/>
  <!-- Zoom is intentionally left enabled (no maximum-scale / user-scalable=no) so readers can enlarge the text. -->
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   【投稿】Machine Learning With Spark Note 1:数据基本处理  | 数螺 | NAUT IDEA
  </title>
  <link href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap-theme.min.css" rel="stylesheet"/>
  <link href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap.min.css" rel="stylesheet"/>
  <style type="text/css">
   /* ---- Article container (#xmain) ---- */

   /* Images never overflow the column and sit on their own line. */
   #xmain img {
     display: block;
     max-width: 100%;
     margin-top: 10px;
     margin-bottom: 10px;
   }

   /* Body paragraphs. */
   #xmain p {
     margin-top: 20px;
     font-size: 16px;
     line-height:150%;
   }

   /* Heading scale inside the article. */
   #xmain h2 {
     font-size: 24px;
   }

   #xmain h3 {
     font-size: 20px;
   }

   #xmain h4 {
     font-size: 18px;
   }

   /* ---- Site banner (.header) ---- */

   /* White text on a blue bar. */
   .header {
     margin-bottom: 20px;
     color: #ffffff;
     background-color: #0099ff;
   }

   /* Banner text is laid out inline so it can sit beside the logo. */
   .header p {
     display: inline-block;
     vertical-align: middle;
     margin: 0px;
     padding: 10px 0;
     font-size: 16px;
   }

   .header a {
     color: white;
   }

   .header img {
     height: 25px;
   }
  </style>
  <script src="http://cdn.bootcss.com/jquery/3.0.0/jquery.min.js">
  </script>
  <script src="http://nautstatic-10007657.file.myqcloud.com/static/css/readability.min.js" type="text/javascript">
  </script>
  <script type="text/javascript">
   // Once the DOM is ready, run Readability over a copy of this page and
   // render the extracted article inside the #xmain container.
   $(document).ready(function() {
     // URL metadata passed to Readability as its first argument; every value
     // refers to the original dataunion.org article.
     var uri = {
       spec: "http://dataunion.org/22496.html",
       host: "http://dataunion.org",
       prePath: "http://dataunion.org",
       scheme: "http",
       pathBase: "http://dataunion.org/"
     };

     // Parse a clone rather than the live document.
     // NOTE(review): Readability is documented to mutate the DOM it parses - confirm for this build.
     var documentClone = document.cloneNode(true);
     var article = new Readability(uri, documentClone).parse();

     // parse() may produce no article; in that case keep the existing markup
     // in #xmain instead of throwing on a null dereference.
     if (article && article.content) {
       document.getElementById("xmain").innerHTML = article.content;
     }
   });
  </script>
  <!-- 1466459803: Accept with keywords: (title(0.333333333333):基本,投稿,社区,数盟,Spark,数据, topn(0.466666666667):社区,总数,职业规划,数盟,信息,行业资讯,分布,数据挖掘,Python,电影,基础架构,计算,文章,异值,可视化,Spark,数据,基本,投稿,python,用户,次数,spark,编程语言,定值,特征,评论,归一化,1号店,职业).-->
 </head>
 <body onload="">
  <!-- Site banner: logo and site name both link to /databee; the tagline column carries hidden-xs. -->
  <div class="header">
   <div class="container">
    <div class="row">
     <div class="col-xs-6 col-sm-6 text-left">
      <a href="/databee">
       <!-- alt gives the image-only link an accessible name. -->
       <img alt="数螺" src="http://nautidea-10007657.cos.myqcloud.com/logo_white.png"/>
      </a>
      <a href="/databee">
       <p>
        数螺
       </p>
      </a>
     </div>
     <div class="hidden-xs col-sm-6 text-right">
      <p>
       致力于数据科学的推广和知识传播
      </p>
     </div>
    </div>
   </div>
  </div>
  <div class="container text-center">
   <h1>
    【投稿】Machine Learning With Spark Note 1:数据基本处理
   </h1>
  </div>
  <div class="container" id="xmain">
   ﻿﻿
   <title>
    【投稿】Machine Learning With Spark Note 1:数据基本处理 | 数盟社区
   </title>
   <!-- All in One SEO Pack 2.2.7.6.2 by Michael Torbert of Semper Fi Web Design[32,92] -->
   <!-- /all in one seo pack -->
   <!--
<div align="center">
<a href="http://strata.oreilly.com.cn/hadoop-big-data-cn?cmp=mp-data-confreg-home-stcn16_dataunion_pc" target="_blank"><img src="http://dataunion.org/wp-content/uploads/2016/05/stratabj.jpg"/ ></a>
</div>
-->
   <header id="header-web">
    <div class="header-main">
     <hgroup class="logo">
      <h1>
       <a href="http://dataunion.org/" rel="home" title="数盟社区">
        <img src="http://dataunion.org/wp-content/themes/yzipi/images/logo.png"/>
       </a>
      </h1>
     </hgroup>
     <!--logo-->
     <nav class="header-nav">
      <ul class="menu" id="menu-%e4%b8%bb%e8%8f%9c%e5%8d%95">
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-has-children menu-item-71" id="menu-item-71">
        <a href="http://dataunion.org/category/events" title="events">
         活动
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-22457" id="menu-item-22457">
          <a href="http://dataunion.org/2016timeline">
           2016档期
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-22459" id="menu-item-22459">
          <a href="http://dataunion.org/category/parterc">
           合作会议
          </a>
         </li>
        </ul>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category current-post-ancestor menu-item-has-children menu-item-20869" id="menu-item-20869">
        <a href="http://dataunion.org/category/tech" title="articles">
         文章
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-20867" id="menu-item-20867">
          <a href="http://dataunion.org/category/tech/base" title="base">
           基础架构
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3302" id="menu-item-3302">
          <a href="http://dataunion.org/category/tech/ai" title="ai">
           人工智能
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3303" id="menu-item-3303">
          <a href="http://dataunion.org/category/tech/analysis" title="analysis">
           数据分析
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-21920" id="menu-item-21920">
          <a href="http://dataunion.org/category/tech/dm">
           数据挖掘
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-3314" id="menu-item-3314">
          <a href="http://dataunion.org/category/tech/viz" title="viz">
           可视化
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category current-post-ancestor current-menu-parent current-post-parent menu-item-3305" id="menu-item-3305">
          <a href="http://dataunion.org/category/tech/devl" title="devl">
           编程语言
          </a>
         </li>
        </ul>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-has-children menu-item-20876" id="menu-item-20876">
        <a href="http://dataunion.org/category/industry">
         行业
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-16328" id="menu-item-16328">
          <a href="http://dataunion.org/category/industry/case" title="case">
           行业应用
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-2112" id="menu-item-2112">
          <a href="http://dataunion.org/category/industry/demo" title="demo">
           Demo展示
          </a>
         </li>
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-21562" id="menu-item-21562">
          <a href="http://dataunion.org/category/industry/news">
           行业资讯
          </a>
         </li>
        </ul>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-311" id="menu-item-311">
        <a href="http://dataunion.org/category/sources" title="sources">
         资源
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-20870" id="menu-item-20870">
        <a href="http://dataunion.org/category/books" title="book">
         图书
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-21363" id="menu-item-21363">
        <a href="http://dataunion.org/category/training">
         课程
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-has-children menu-item-21853" id="menu-item-21853">
        <a href="http://dataunion.org/category/jobs">
         职位
        </a>
        <ul class="sub-menu">
         <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-22050" id="menu-item-22050">
          <a href="http://dataunion.org/category/career">
           职业规划
          </a>
         </li>
        </ul>
       </li>
      </ul>
     </nav>
     <!--header-nav-->
    </div>
   </header>
   <!--header-web-->
   <div id="main">
    <div id="soutab">
     <form action="http://dataunion.org/" class="search" method="get">
     </form>
    </div>
    <div id="container">
     <nav id="mbx">
      当前位置：
      <a href="http://dataunion.org">
       首页
      </a>
      &gt;
      <a href="http://dataunion.org/category/tech">
       文章
      </a>
      &gt;
      <a href="http://dataunion.org/category/tech/devl">
       编程语言
      </a>
      &gt;  正文
     </nav>
     <!--mbx-->
     <article class="content">
      <header align="centre" class="contenttitle">
       <div class="mscc">
        <h1 class="mscctitle">
         <a href="http://dataunion.org/22496.html">
          【投稿】Machine Learning With Spark Note 1:数据基本处理
         </a>
        </h1>
        <address class="msccaddress ">
         <em>
          888 次阅读 -
         </em>
         <a href="http://dataunion.org/category/tech/devl" rel="category tag">
          编程语言
         </a>
        </address>
       </div>
      </header>
      <div class="content-text">
       <p>
        本文为数盟特约作者投稿，欢迎转载，请注明出处“数盟社区”和作者
       </p>
       <p>
        博主简介：段石石，1号店精准化推荐算法工程师，主要负责1号店用户画像构建，喜欢钻研点Machine Learning的黑科技，对Deep Learning感兴趣，喜欢玩kaggle、看9神，对数据和Machine Learning有兴趣咱们可以一起聊聊，个人博客：
        <a href="http://hacker.duanshishi.com" target="_blank">
         hacker.duanshishi.com
        </a>
       </p>
       <h3>
        接入公共数据库
       </h3>
       <p>
        可用于机器学习模型的数据库有很多，包括：
       </p>
       <ul>
        <li>
         UCI机器学习源：http://archive.ics.uci.edu/ml/
        </li>
        <li>
         Amazon AWS公共数据集：http://aws.amazon.com/publicdatasets/
        </li>
        <li>
         Kaggle:http://www.kaggle.com/competitions
        </li>
        <li>
         KDnuggets:http://www.kdnuggets.com/datasets/index.html
        </li>
       </ul>
       <p>
        在本章中，我们使用一个经典的电影数据集MovieLens（http://files.grouplens.org/datasets/movielens/ml-100k.zip）
       </p>
       <h2>
        了解数据
       </h2>
       <p>
        下载MovieLens数据集：
       </p>
       <p>
        <img src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage24.png"/>
       </p>
       <p>
        解压，检查下数据格式：
       </p>
       <p>
        <img src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage25.png"/>
        <img src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage27.png"/>
       </p>
       <p>
        <img src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage28.png"/>
       </p>
       <p>
        u.user存储用户基本信息，u.item存储电影基本信息，u.data存储user_id,movie_id,rating,timestamp信息。
       </p>
       <h3>
        使用Python notebook看看用户数据
       </h3>
       <p>
        将数据拷到对应路径
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5b5866115864" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          user_data = sc.textFile('../data/ML_spark/MovieLens/u.user')
user_data.first()
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb5b5866115864-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5b5866115864-2">
               2
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb5b5866115864-1">
               <span class="crayon-v">
                user_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                textFile
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '../data/ML_spark/MovieLens/u.user'
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5b5866115864-2">
               <span class="crayon-v">
                user_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                first
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0014 seconds] -->
       <p>
       </p>
       <p>
        <img src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage29.png"/>
       </p>
       <p>
        计算数据当中的基本信息，比如用户总数、性别总数（应该是2吧）、职业数、zip数目：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5c5551046293" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          user_fields = user_data.map(lambda line: line.split('|'))
num_users = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields : fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
print "Users: %d, genders: %d, occupations: %d, ZIP codes: %d"%(num_users,num_genders,num_occupations,num_zipcodes)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb5c5551046293-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5c5551046293-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5c5551046293-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5c5551046293-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5c5551046293-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5c5551046293-6">
               6
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb5c5551046293-1">
               <span class="crayon-v">
                user_fields
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                user_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                line
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                line
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                split
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '|'
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5c5551046293-2">
               <span class="crayon-v">
                num_users
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                user_fields
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5c5551046293-3">
               <span class="crayon-v">
                num_genders
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                user_fields
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                2
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                distinct
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5c5551046293-4">
               <span class="crayon-v">
                num_occupations
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                user_fields
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                3
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                distinct
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5c5551046293-5">
               <span class="crayon-v">
                num_zipcodes
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                user_fields
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                4
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                distinct
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5c5551046293-6">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Users: %d, genders: %d, occupations: %d, ZIP codes: %d"
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                num_users
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                num_genders
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                num_occupations
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                num_zipcodes
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0340 seconds] -->
       <p>
       </p>
       <p>
        <img src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage30.png"/>
       </p>
       <p>
        计算用户的年纪的基本分布：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5ce379190889" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <!-- Plain-text copy of the listing. Textarea content is literal text: the code starts on the
             line right after the start tag (the parser drops that single newline), carries no
             indentation, and is closed immediately, so the copied code is exactly the six
             source lines with no leading or trailing whitespace. -->
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
import matplotlib.pyplot as plt
from matplotlib.pyplot import hist
ages = user_fields.map(lambda x: int(x[1])).collect()
hist(ages, bins=20, color='lightblue',normed=True)
fig = plt.gcf()
fig.set_size_inches(16,10)</textarea>
        </div>
        <!-- Highlighted view of the listing: one table row holding the line-number gutter and the
             tokenised code. The code cell declares tab-size, i.e. it is meant to be laid out as
             preformatted text, so whitespace between tags would show up in the rendered code.
             Tokens and lines are therefore written with no whitespace between tags (source line
             breaks sit inside the tags). Each whitespace-only crayon-h token holds one space, and
             the space before "as" / "import" is kept inside the preceding token, so the rendered
             text matches the plain-text copy of the listing character for character. -->
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;"
              ><div class="crayon-num" data-line="crayon-57686696bb5ce379190889-1">1</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5ce379190889-2">2</div
              ><div class="crayon-num" data-line="crayon-57686696bb5ce379190889-3">3</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5ce379190889-4">4</div
              ><div class="crayon-num" data-line="crayon-57686696bb5ce379190889-5">5</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5ce379190889-6">6</div
             ></div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;"
              ><div class="crayon-line" id="crayon-57686696bb5ce379190889-1"><span class="crayon-r">import</span><span class="crayon-h"> </span><span class="crayon-v">matplotlib</span><span class="crayon-sy">.</span><span class="crayon-e">pyplot </span><span class="crayon-st">as</span><span class="crayon-h"> </span><span class="crayon-e">plt</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5ce379190889-2"><span class="crayon-st">from</span><span class="crayon-h"> </span><span class="crayon-v">matplotlib</span><span class="crayon-sy">.</span><span class="crayon-e">pyplot </span><span class="crayon-r">import</span><span class="crayon-h"> </span><span class="crayon-e">hist</span></div
              ><div class="crayon-line" id="crayon-57686696bb5ce379190889-3"><span class="crayon-v">ages</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">user_fields</span><span class="crayon-sy">.</span><span class="crayon-k ">map</span><span class="crayon-sy">(</span><span class="crayon-r">lambda</span><span class="crayon-h"> </span><span class="crayon-v">x</span><span class="crayon-o">:</span><span class="crayon-h"> </span><span class="crayon-k ">int</span><span class="crayon-sy">(</span><span class="crayon-v">x</span><span class="crayon-sy">[</span><span class="crayon-cn">1</span><span class="crayon-sy">]</span><span class="crayon-sy">)</span><span class="crayon-sy">)</span><span class="crayon-sy">.</span><span class="crayon-e">collect</span><span class="crayon-sy">(</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5ce379190889-4"><span class="crayon-e">hist</span><span class="crayon-sy">(</span><span class="crayon-v">ages</span><span class="crayon-sy">,</span><span class="crayon-h"> </span><span class="crayon-v">bins</span><span class="crayon-o">=</span><span class="crayon-cn">20</span><span class="crayon-sy">,</span><span class="crayon-h"> </span><span class="crayon-v">color</span><span class="crayon-o">=</span><span class="crayon-s">'lightblue'</span><span class="crayon-sy">,</span><span class="crayon-v">normed</span><span class="crayon-o">=</span><span class="crayon-t">True</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line" id="crayon-57686696bb5ce379190889-5"><span class="crayon-v">fig</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">plt</span><span class="crayon-sy">.</span><span class="crayon-e">gcf</span><span class="crayon-sy">(</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5ce379190889-6"><span class="crayon-v">fig</span><span class="crayon-sy">.</span><span class="crayon-e">set_size_inches</span><span class="crayon-sy">(</span><span class="crayon-cn">16</span><span class="crayon-sy">,</span><span class="crayon-cn">10</span><span class="crayon-sy">)</span></div
             ></div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0167 seconds] -->
       <p>
       </p>
       <!-- Figure placed directly after the age-histogram listing; the alt text names it for readers
            who cannot see the image. NOTE(review): the alt wording assumes the image is that
            histogram's output. Confirm against the picture. -->
       <p>
        <img alt="用户年龄分布直方图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage31.png"/>
       </p>
       <p>
        计算用户的职业的分布：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5d5194232866" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <!-- Crayon toolbar for this listing: six icon buttons whose title attributes read, in order,
             "toggle line-number display", "show code as plain text", "toggle word wrap",
             "click to expand code", "copy code" and "show code in a new window", followed by the
             language label ("Python"). The crayon-info div after the toolbar is empty in this markup.
             NOTE(review): the buttons are plain divs with no handlers in this markup; presumably the
             Crayon script attaches them at runtime. Confirm that script is loaded on this page. -->
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <!-- Plain-text copy of the listing. Textarea content is literal text: the code starts on the
             line right after the start tag (the parser drops that single newline), carries no
             indentation, and is closed immediately, so the copied code is exactly the seventeen
             source lines (including the blank one) with no leading or trailing whitespace. -->
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
import numpy as np
count_by_occupation = user_fields.map(lambda fields: (fields[3],1)).reduceByKey(lambda x,y:x+y).collect()
print count_by_occupation
x_axis1 = np.array([c[0] for c in count_by_occupation])
y_axis1 = np.array([c[1] for c in count_by_occupation])
x_axis = x_axis1[np.argsort(y_axis1)]
y_axis = y_axis1[np.argsort(y_axis1)]
pos = np.arange(len(x_axis))
width = 1.0
ax = plt.axes()
ax.set_xticks(pos+(width)/2)
ax.set_xticklabels(x_axis)

plt.bar(pos, y_axis, width, color='lightblue')
plt.xticks(rotation=30)
fig = plt.gcf()
fig.set_size_inches(10,6)</textarea>
        </div>
        <!-- Highlighted view of the listing: one table row holding the line-number gutter and the
             tokenised code. The code cell declares tab-size, i.e. it is meant to be laid out as
             preformatted text, so whitespace between tags would show up in the rendered code.
             Tokens and lines are therefore written with no whitespace between tags (source line
             breaks sit inside the tags). Each whitespace-only crayon-h token holds one space, and
             the space before "as" is kept inside the preceding token, so the rendered text matches
             the plain-text copy of the listing character for character. Line 13 is the blank line
             of the listing and stays an empty element. -->
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;"
              ><div class="crayon-num" data-line="crayon-57686696bb5d5194232866-1">1</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5d5194232866-2">2</div
              ><div class="crayon-num" data-line="crayon-57686696bb5d5194232866-3">3</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5d5194232866-4">4</div
              ><div class="crayon-num" data-line="crayon-57686696bb5d5194232866-5">5</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5d5194232866-6">6</div
              ><div class="crayon-num" data-line="crayon-57686696bb5d5194232866-7">7</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5d5194232866-8">8</div
              ><div class="crayon-num" data-line="crayon-57686696bb5d5194232866-9">9</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5d5194232866-10">10</div
              ><div class="crayon-num" data-line="crayon-57686696bb5d5194232866-11">11</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5d5194232866-12">12</div
              ><div class="crayon-num" data-line="crayon-57686696bb5d5194232866-13">13</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5d5194232866-14">14</div
              ><div class="crayon-num" data-line="crayon-57686696bb5d5194232866-15">15</div
              ><div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5d5194232866-16">16</div
              ><div class="crayon-num" data-line="crayon-57686696bb5d5194232866-17">17</div
             ></div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;"
              ><div class="crayon-line" id="crayon-57686696bb5d5194232866-1"><span class="crayon-r">import</span><span class="crayon-h"> </span><span class="crayon-e">numpy </span><span class="crayon-st">as</span><span class="crayon-h"> </span><span class="crayon-e">np</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5d5194232866-2"><span class="crayon-v">count_by_occupation</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">user_fields</span><span class="crayon-sy">.</span><span class="crayon-k ">map</span><span class="crayon-sy">(</span><span class="crayon-r">lambda</span><span class="crayon-h"> </span><span class="crayon-v">fields</span><span class="crayon-o">:</span><span class="crayon-h"> </span><span class="crayon-sy">(</span><span class="crayon-v">fields</span><span class="crayon-sy">[</span><span class="crayon-cn">3</span><span class="crayon-sy">]</span><span class="crayon-sy">,</span><span class="crayon-cn">1</span><span class="crayon-sy">)</span><span class="crayon-sy">)</span><span class="crayon-sy">.</span><span class="crayon-e">reduceByKey</span><span class="crayon-sy">(</span><span class="crayon-r">lambda</span><span class="crayon-h"> </span><span class="crayon-v">x</span><span class="crayon-sy">,</span><span class="crayon-v">y</span><span class="crayon-o">:</span><span class="crayon-v">x</span><span class="crayon-o">+</span><span class="crayon-v">y</span><span class="crayon-sy">)</span><span class="crayon-sy">.</span><span class="crayon-e">collect</span><span class="crayon-sy">(</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line" id="crayon-57686696bb5d5194232866-3"><span class="crayon-k ">print</span><span class="crayon-h"> </span><span class="crayon-e">count_by_occupation</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5d5194232866-4"><span class="crayon-v">x_axis1</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">np</span><span class="crayon-sy">.</span><span class="crayon-k ">array</span><span class="crayon-sy">(</span><span class="crayon-sy">[</span><span class="crayon-v">c</span><span class="crayon-sy">[</span><span class="crayon-cn">0</span><span class="crayon-sy">]</span><span class="crayon-h"> </span><span class="crayon-st">for</span><span class="crayon-h"> </span><span class="crayon-i">c</span><span class="crayon-h"> </span><span class="crayon-st">in</span><span class="crayon-h"> </span><span class="crayon-v">count_by_occupation</span><span class="crayon-sy">]</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line" id="crayon-57686696bb5d5194232866-5"><span class="crayon-v">y_axis1</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">np</span><span class="crayon-sy">.</span><span class="crayon-k ">array</span><span class="crayon-sy">(</span><span class="crayon-sy">[</span><span class="crayon-v">c</span><span class="crayon-sy">[</span><span class="crayon-cn">1</span><span class="crayon-sy">]</span><span class="crayon-h"> </span><span class="crayon-st">for</span><span class="crayon-h"> </span><span class="crayon-i">c</span><span class="crayon-h"> </span><span class="crayon-st">in</span><span class="crayon-h"> </span><span class="crayon-v">count_by_occupation</span><span class="crayon-sy">]</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5d5194232866-6"><span class="crayon-v">x_axis</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">x_axis1</span><span class="crayon-sy">[</span><span class="crayon-v">np</span><span class="crayon-sy">.</span><span class="crayon-e">argsort</span><span class="crayon-sy">(</span><span class="crayon-v">y_axis1</span><span class="crayon-sy">)</span><span class="crayon-sy">]</span></div
              ><div class="crayon-line" id="crayon-57686696bb5d5194232866-7"><span class="crayon-v">y_axis</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">y_axis1</span><span class="crayon-sy">[</span><span class="crayon-v">np</span><span class="crayon-sy">.</span><span class="crayon-e">argsort</span><span class="crayon-sy">(</span><span class="crayon-v">y_axis1</span><span class="crayon-sy">)</span><span class="crayon-sy">]</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5d5194232866-8"><span class="crayon-v">pos</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">np</span><span class="crayon-sy">.</span><span class="crayon-e">arange</span><span class="crayon-sy">(</span><span class="crayon-k ">len</span><span class="crayon-sy">(</span><span class="crayon-v">x_axis</span><span class="crayon-sy">)</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line" id="crayon-57686696bb5d5194232866-9"><span class="crayon-v">width</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-cn">1.0</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5d5194232866-10"><span class="crayon-v">ax</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">plt</span><span class="crayon-sy">.</span><span class="crayon-e">axes</span><span class="crayon-sy">(</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line" id="crayon-57686696bb5d5194232866-11"><span class="crayon-v">ax</span><span class="crayon-sy">.</span><span class="crayon-e">set_xticks</span><span class="crayon-sy">(</span><span class="crayon-v">pos</span><span class="crayon-o">+</span><span class="crayon-sy">(</span><span class="crayon-v">width</span><span class="crayon-sy">)</span><span class="crayon-o">/</span><span class="crayon-cn">2</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5d5194232866-12"><span class="crayon-v">ax</span><span class="crayon-sy">.</span><span class="crayon-e">set_xticklabels</span><span class="crayon-sy">(</span><span class="crayon-v">x_axis</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line" id="crayon-57686696bb5d5194232866-13"></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5d5194232866-14"><span class="crayon-v">plt</span><span class="crayon-sy">.</span><span class="crayon-e">bar</span><span class="crayon-sy">(</span><span class="crayon-v">pos</span><span class="crayon-sy">,</span><span class="crayon-h"> </span><span class="crayon-v">y_axis</span><span class="crayon-sy">,</span><span class="crayon-h"> </span><span class="crayon-v">width</span><span class="crayon-sy">,</span><span class="crayon-h"> </span><span class="crayon-v">color</span><span class="crayon-o">=</span><span class="crayon-s">'lightblue'</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line" id="crayon-57686696bb5d5194232866-15"><span class="crayon-v">plt</span><span class="crayon-sy">.</span><span class="crayon-e">xticks</span><span class="crayon-sy">(</span><span class="crayon-v">rotation</span><span class="crayon-o">=</span><span class="crayon-cn">30</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line crayon-striped-line" id="crayon-57686696bb5d5194232866-16"><span class="crayon-v">fig</span><span class="crayon-h"> </span><span class="crayon-o">=</span><span class="crayon-h"> </span><span class="crayon-v">plt</span><span class="crayon-sy">.</span><span class="crayon-e">gcf</span><span class="crayon-sy">(</span><span class="crayon-sy">)</span></div
              ><div class="crayon-line" id="crayon-57686696bb5d5194232866-17"><span class="crayon-v">fig</span><span class="crayon-sy">.</span><span class="crayon-e">set_size_inches</span><span class="crayon-sy">(</span><span class="crayon-cn">10</span><span class="crayon-sy">,</span><span class="crayon-cn">6</span><span class="crayon-sy">)</span></div
             ></div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0474 seconds] -->
       <p>
       </p>
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5dc986105601" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb5dc986105601-1">
               1
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb5dc986105601-1">
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <p>
       </p>
       <p>
        <img src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage35.png"/>
       </p>
       <p>
        <img src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage33.png"/>
       </p>
       <h3>
        使用IPython notebook看看电影数据
       </h3>
       <p>
        读入电影数据，计算总数：
       </p>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5e3048905705" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
movie_data = sc.textFile("../data/ML_spark/MovieLens/u.item")
print movie_data.first()
num_movies = movie_data.count()
print 'Movies: %d' % num_movies
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb5e3048905705-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e3048905705-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5e3048905705-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e3048905705-4">
               4
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb5e3048905705-1">
               <span class="crayon-v">
                movie_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                textFile
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                "../data/ML_spark/MovieLens/u.item"
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e3048905705-2">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                first
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5e3048905705-3">
               <span class="crayon-v">
                num_movies
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e3048905705-4">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Movies: %d'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                num_movies
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0019 seconds] -->
       <p>
       </p>
       <p>
        <img alt="上述代码的输出：u.item 的第一条记录和电影总数" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage36.png"/>
       </p>
       <p>
        计算电影的age分布：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5e9599698960" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
def convert_year(x):
    try:
        return int(x[-4:])
    except:
        return 1900

movie_fields = movie_data.map(lambda lines:lines.split('|'))
years = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))
years_filtered = years.filter(lambda x: x!=1900)
print years_filtered.count()
movie_ages = years_filtered.map(lambda yr:1998-yr).countByValue()
values = movie_ages.values()
bins = movie_ages.keys()
hist(values, bins=bins, color='lightblue',normed=True)
fig = plt.gcf()
fig.set_size_inches(8,5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb5e9599698960-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e9599698960-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5e9599698960-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e9599698960-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5e9599698960-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e9599698960-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5e9599698960-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e9599698960-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5e9599698960-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e9599698960-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5e9599698960-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e9599698960-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5e9599698960-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e9599698960-14">
               14
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5e9599698960-15">
               15
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5e9599698960-16">
               16
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb5e9599698960-1">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                convert_year
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e9599698960-2">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                try
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5e9599698960-3">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-cn">
                4
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e9599698960-4">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                except
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5e9599698960-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1900
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e9599698960-6">
              </div>
              <div class="crayon-line" id="crayon-57686696bb5e9599698960-7">
               <span class="crayon-v">
                movie_fields
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                lines
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-v">
                lines
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                split
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '|'
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e9599698960-8">
               <span class="crayon-v">
                years
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_fields
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                2
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                convert_year
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5e9599698960-9">
               <span class="crayon-v">
                years_filtered
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                years
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                filter
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                !=
               </span>
               <span class="crayon-cn">
                1900
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e9599698960-10">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                years_filtered
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5e9599698960-11">
               <span class="crayon-v">
                movie_ages
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                years_filtered
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                yr
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-cn">
                1998
               </span>
               <span class="crayon-o">
                -
               </span>
               <span class="crayon-v">
                yr
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                countByValue
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e9599698960-12">
               <span class="crayon-v">
                values
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_ages
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                values
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5e9599698960-13">
               <span class="crayon-v">
                bins
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_ages
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                keys
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e9599698960-14">
               <span class="crayon-e">
                hist
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                values
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                bins
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-v">
                bins
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                color
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-s">
                'lightblue'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                normed
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-t">
                True
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5e9599698960-15">
               <span class="crayon-v">
                fig
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                plt
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                gcf
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5e9599698960-16">
               <span class="crayon-v">
                fig
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                set_size_inches
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                8
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0305 seconds] -->
       <p>
       </p>
       <p>
        <img alt="电影年龄（age）分布直方图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage37.png"/>
       </p>
       <h3>
        使用IPython notebook看看用户对电影评分的数据集
       </h3>
       <p>
        查看数据记录数量：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5ef146817249" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
rating_data = sc.textFile('../data/ML_spark/MovieLens/u.data')
print rating_data.first()
num_ratings = rating_data.count()
print 'Ratings: %d'% num_ratings
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb5ef146817249-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5ef146817249-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5ef146817249-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5ef146817249-4">
               4
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb5ef146817249-1">
               <span class="crayon-v">
                rating_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                textFile
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '../data/ML_spark/MovieLens/u.data'
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5ef146817249-2">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                rating_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                first
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5ef146817249-3">
               <span class="crayon-v">
                num_ratings
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                rating_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                count
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5ef146817249-4">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Ratings: %d'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                num_ratings
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0021 seconds] -->
       <p>
       </p>
       <p>
        对数据进行一些基本的统计：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5f5682176964" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
rating_data = rating_data.map(lambda line: line.split('\t'))
ratings = rating_data.map(lambda fields: int(fields[2]))
max_rating = ratings.reduce(lambda x,y:max(x,y))
min_rating = ratings.reduce(lambda x,y:min(x,y))
mean_rating = ratings.reduce(lambda x,y:x+y)/num_ratings
median_rating = np.median(ratings.collect())
ratings_per_user = num_ratings/num_users;
ratings_per_movie = num_ratings/ num_movies
print 'Min rating: %d' %min_rating
print 'max rating: %d' % max_rating
print 'Average rating: %2.2f' %mean_rating
print 'Median rating: %d '%median_rating
print 'Average # of ratings per user: %2.2f'%ratings_per_user
print 'Average # of ratings per movie: %2.2f' % ratings_per_movie
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb5f5682176964-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5f5682176964-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5f5682176964-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5f5682176964-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5f5682176964-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5f5682176964-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5f5682176964-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5f5682176964-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5f5682176964-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5f5682176964-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5f5682176964-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5f5682176964-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5f5682176964-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5f5682176964-14">
               14
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb5f5682176964-1">
               <span class="crayon-v">
                rating_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                rating_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                line
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                line
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                split
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                '\t'
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5f5682176964-2">
               <span class="crayon-v">
                ratings
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                rating_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                2
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5f5682176964-3">
               <span class="crayon-v">
                max_rating
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                ratings
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                reduce
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-k ">
                max
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5f5682176964-4">
               <span class="crayon-v">
                min_rating
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                ratings
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                reduce
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-k ">
                min
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5f5682176964-5">
               <span class="crayon-v">
                mean_rating
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                ratings
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                reduce
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                +
               </span>
               <span class="crayon-v">
                y
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-e">
                num_ratings
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5f5682176964-6">
               <span class="crayon-v">
                median_rating
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                median
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                ratings
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5f5682176964-7">
               <span class="crayon-v">
                ratings_per_user
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                num_ratings
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-v">
                num_users
               </span>
               <span class="crayon-sy">
                ;
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5f5682176964-8">
               <span class="crayon-v">
                ratings_per_movie
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                num_ratings
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                num_movies
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5f5682176964-9">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Min rating: %d'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                min_rating
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5f5682176964-10">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'max rating: %d'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                max_rating
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5f5682176964-11">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Average rating: %2.2f'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                mean_rating
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5f5682176964-12">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Median rating: %d '
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                median_rating
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5f5682176964-13">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Average # of ratings per user: %2.2f'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                ratings_per_user
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5f5682176964-14">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Average # of ratings per movie: %2.2f'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                ratings_per_movie
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0277 seconds] -->
       <p>
       </p>
       <p>
        <img alt="上述评分统计代码的运行结果" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage38.png"/>
       </p>
       <p>
        计算ratings value的分布：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb5fc668916551" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
count_by_rating = ratings.countByValue()
x_axis = np.array(count_by_rating.keys())
y_axis = np.array([float(c) for c in count_by_rating.values()])
y_axis_normed = y_axis/y_axis.sum()
pos = np.arange(len(x_axis))
width = 1.0
ax = plt.axes()
ax.set_xticks(pos+(width/2))
ax.set_xticklabels(x_axis)

plt.bar(pos, y_axis_normed, width, color='lightblue')
plt.xticks(rotation=30)
fig = plt.gcf()
fig.set_size_inches(8,5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb5fc668916551-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5fc668916551-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5fc668916551-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5fc668916551-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5fc668916551-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5fc668916551-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5fc668916551-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5fc668916551-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5fc668916551-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5fc668916551-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5fc668916551-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5fc668916551-12">
               12
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb5fc668916551-13">
               13
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb5fc668916551-14">
               14
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb5fc668916551-1">
               <span class="crayon-v">
                count_by_rating
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                ratings
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                countByValue
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5fc668916551-2">
               <span class="crayon-v">
                x_axis
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                array
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                count_by_rating
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                keys
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5fc668916551-3">
               <span class="crayon-v">
                y_axis
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                array
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-k ">
                float
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                c
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                c
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                count_by_rating
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                values
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5fc668916551-4">
               <span class="crayon-v">
                y_axis_normed
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                y_axis
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-v">
                y_axis
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                sum
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5fc668916551-5">
               <span class="crayon-v">
                pos
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                arange
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                len
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x_axis
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5fc668916551-6">
               <span class="crayon-v">
                width
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1.0
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5fc668916551-7">
               <span class="crayon-v">
                ax
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                plt
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                axes
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5fc668916551-8">
               <span class="crayon-v">
                ax
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                set_xticks
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                pos
               </span>
               <span class="crayon-o">
                +
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                width
               </span>
               <span class="crayon-o">
                /
               </span>
               <span class="crayon-cn">
                2
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5fc668916551-9">
               <span class="crayon-v">
                ax
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                set_xticklabels
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x_axis
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5fc668916551-10">
              </div>
              <div class="crayon-line" id="crayon-57686696bb5fc668916551-11">
               <span class="crayon-v">
                plt
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                bar
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                pos
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                y_axis_normed
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                width
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                color
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-s">
                'lightblue'
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5fc668916551-12">
               <span class="crayon-v">
                plt
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                xticks
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                rotation
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-cn">
                30
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb5fc668916551-13">
               <span class="crayon-v">
                fig
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                plt
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                gcf
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb5fc668916551-14">
               <span class="crayon-v">
                fig
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                set_size_inches
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                8
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0250 seconds] -->
       <p>
       </p>
       <p>
        <img alt="各评分值所占比例的柱状图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage39.png"/>
       </p>
       <p>
        计算每个用户和其对应的评价次数：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb602747540512" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
user_ratings_grouped = rating_data.map(lambda fields:(int(fields[0]),int(fields[2]))).groupByKey()
user_rating_byuser = user_ratings_grouped.map(lambda (k,v):(k,len(v)))
user_rating_byuser.take(5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb602747540512-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb602747540512-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb602747540512-3">
               3
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb602747540512-1">
               <span class="crayon-v">
                user_ratings_grouped
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                rating_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                2
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                groupByKey
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb602747540512-2">
               <span class="crayon-v">
                user_rating_byuser
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                user_ratings_grouped
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                k
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                k
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-k ">
                len
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb602747540512-3">
               <span class="crayon-v">
                user_rating_byuser
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0193 seconds] -->
       <p>
       </p>
       <p>
        <img alt="user_rating_byuser.take(5) 的输出结果" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage40.png"/>
       </p>
       <p>
        计算每个用户评价总次数的分布：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb608807869485" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
user_ratings_byuser_local = user_rating_byuser.map(lambda (k,v):v).collect()
hist(user_ratings_byuser_local, bins=200, color = 'lightblue',normed = True)
fig = plt.gcf()
fig.set_size_inches(8,5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb608807869485-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb608807869485-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb608807869485-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb608807869485-4">
               4
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb608807869485-1">
               <span class="crayon-v">
                user_ratings_byuser_local
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                user_rating_byuser
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                k
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb608807869485-2">
               <span class="crayon-e">
                hist
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                user_ratings_byuser_local
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                bins
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-cn">
                200
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                color
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'lightblue'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                normed
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-t">
                True
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb608807869485-3">
               <span class="crayon-v">
                fig
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                plt
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                gcf
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb608807869485-4">
               <span class="crayon-v">
                fig
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                set_size_inches
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                8
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0229 seconds] -->
       <p>
       </p>
       <p>
        <img alt="每个用户评价次数分布的直方图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage41.png"/>
       </p>
       <p>
        计算每部电影被评价次数的分布：
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb60e586872220" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          # 为每部电影计算他的被评论的次数的分布
movie_ratings_group = rating_data.map(lambda fields: (int(fields[1]),int(fields[2]))).groupByKey()
movie_ratings_byuser = movie_ratings_group.map(lambda (k,v):(k,len(v)))
movie_ratings_byuser.take(5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb60e586872220-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb60e586872220-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb60e586872220-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb60e586872220-4">
               4
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb60e586872220-1">
               <span class="crayon-c">
                # 为每部电影计算他的被评论的次数的分布
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb60e586872220-2">
               <span class="crayon-v">
                movie_ratings_group
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                rating_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                2
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                groupByKey
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb60e586872220-3">
               <span class="crayon-v">
                movie_ratings_byuser
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_ratings_group
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                k
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                k
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-k ">
                len
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb60e586872220-4">
               <span class="crayon-v">
                movie_ratings_byuser
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0032 seconds] -->
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb614484742969" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          movie_ratings_byuser_local = movie_ratings_byuser.map(lambda (k,v):v).collect()
hist(movie_ratings_byuser_local,bins=200,color='lightblue',normed=True)
fig = plt.gcf()
fig.set_size_inches(8,5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb614484742969-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb614484742969-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb614484742969-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb614484742969-4">
               4
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb614484742969-1">
               <span class="crayon-v">
                movie_ratings_byuser_local
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_ratings_byuser
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                k
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb614484742969-2">
               <span class="crayon-e">
                hist
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                movie_ratings_byuser_local
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                bins
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-cn">
                200
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                color
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-s">
                'lightblue'
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                normed
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-t">
                True
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb614484742969-3">
               <span class="crayon-v">
                fig
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                plt
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                gcf
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb614484742969-4">
               <span class="crayon-v">
                fig
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                set_size_inches
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                8
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0064 seconds] -->
       <p>
       </p>
       <p>
        <img alt="每部电影被评论次数的分布直方图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage42.png"/>
       </p>
       <h2>
        处理与变换数据
       </h2>
       <p>
        主要处理方法：
       </p>
       <ul>
        <li>
         滤除或移除bad values和missing values
        </li>
        <li>
         用给定值来替换bad values和missing values
        </li>
        <li>
         针对异常值使用一些鲁棒性强的技术
        </li>
        <li>
         对潜在的异常值进行转换
        </li>
       </ul>
       <h3>
        用指定值替换bad values和missing values
       </h3>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb61a195465476" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          years_pre_processed = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x)).collect()
years_pre_processed_array = np.array(years_pre_processed)
mean_year = np.mean(years_pre_processed_array[years_pre_processed_array!=1900])
median_year = np.median(years_pre_processed_array[years_pre_processed_array!=1900])
index_bad_data = np.where(years_pre_processed_array==1900)
years_pre_processed_array[index_bad_data] = median_year
print 'Mean year of release: %d' % mean_year
print 'Median year of release: %d ' % median_year
print "Index of '1900' after assigning median: %s"% np.where(years_pre_processed_array==1900)[0]
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb61a195465476-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb61a195465476-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb61a195465476-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb61a195465476-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb61a195465476-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb61a195465476-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb61a195465476-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb61a195465476-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb61a195465476-9">
               9
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb61a195465476-1">
               <span class="crayon-v">
                years_pre_processed
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_fields
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                2
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                convert_year
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb61a195465476-2">
               <span class="crayon-v">
                years_pre_processed_array
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                array
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                years_pre_processed
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb61a195465476-3">
               <span class="crayon-v">
                mean_year
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                mean
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                years_pre_processed_array
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                years_pre_processed_array
               </span>
               <span class="crayon-o">
                !=
               </span>
               <span class="crayon-cn">
                1900
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb61a195465476-4">
               <span class="crayon-v">
                median_year
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                median
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                years_pre_processed_array
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                years_pre_processed_array
               </span>
               <span class="crayon-o">
                !=
               </span>
               <span class="crayon-cn">
                1900
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb61a195465476-5">
               <span class="crayon-v">
                index_bad_data
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                where
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                years_pre_processed_array
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-cn">
                1900
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb61a195465476-6">
               <span class="crayon-v">
                years_pre_processed_array
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                index_bad_data
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                median_year
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb61a195465476-7">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Mean year of release: %d'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                mean_year
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb61a195465476-8">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Median year of release: %d '
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                median_year
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb61a195465476-9">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Index of '1900' after assigning median: %s"
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                where
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                years_pre_processed_array
               </span>
               <span class="crayon-o">
                ==
               </span>
               <span class="crayon-cn">
                1900
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0234 seconds] -->
       <p>
       </p>
       <p>
        用中位数的值来替代那些bad values
       </p>
       <h3>
        从数据中提取有用特征
       </h3>
       <p>
        特征可以分为多种类型，包括：
       </p>
       <ul>
        <li>
         Numerical features
        </li>
        <li>
         Categorical features，如性别
        </li>
        <li>
         Text features，如标题
        </li>
        <li>
         Other features，如经纬度信息
        </li>
        <li>
         Derived features，如前面的movie age
        </li>
       </ul>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb621441676951" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          all_occupations = user_fields.map(lambda fields:fields[3]).distinct().collect()
all_occupations.sort()
idx = 0
all_occupations_dict = {}
for o in all_occupations:
    all_occupations_dict[o] = idx
    idx +=1
print "Encoding of 'doctor': %d" %all_occupations_dict['doctor']
print "Encoding of 'programmer': %d" % all_occupations_dict['programmer']
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb621441676951-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb621441676951-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb621441676951-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb621441676951-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb621441676951-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb621441676951-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb621441676951-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb621441676951-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb621441676951-9">
               9
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb621441676951-1">
               <span class="crayon-v">
                all_occupations
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                user_fields
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                3
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                distinct
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb621441676951-2">
               <span class="crayon-v">
                all_occupations
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                sort
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb621441676951-3">
               <span class="crayon-v">
                idx
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb621441676951-4">
               <span class="crayon-v">
                all_occupations_dict
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                {
               </span>
               <span class="crayon-sy">
                }
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb621441676951-5">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                o
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_occupations
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb621441676951-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_occupations_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                o
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                idx
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb621441676951-7">
               <span class="crayon-e">
               </span>
               <span class="crayon-v">
                idx
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                +=
               </span>
               <span class="crayon-cn">
                1
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb621441676951-8">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Encoding of 'doctor': %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-v">
                all_occupations_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'doctor'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb621441676951-9">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Encoding of 'programmer': %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_occupations_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'programmer'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0171 seconds] -->
       <p>
       </p>
       <p>
        上面将categorical features转换到数值型的，但是经常我们在做数据处理的时候，这类彼此之间没有潜在排序信息的数据，应该进行dummies处理：
       </p>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb628066991141" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          K=len(all_occupations_dict)
binary_x = np.zeros(K)
k_programmer = all_occupations_dict['programmer']
binary_x[k_programmer] = 1
print 'Binary feature vector: %s'%binary_x
print 'Length of binray vector: %d' %K
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb628066991141-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb628066991141-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb628066991141-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb628066991141-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb628066991141-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb628066991141-6">
               6
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb628066991141-1">
               <span class="crayon-v">
                K
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-k ">
                len
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                all_occupations_dict
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb628066991141-2">
               <span class="crayon-v">
                binary_x
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                zeros
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                K
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb628066991141-3">
               <span class="crayon-v">
                k_programmer
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_occupations_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'programmer'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb628066991141-4">
               <span class="crayon-v">
                binary_x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                k_programmer
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb628066991141-5">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Binary feature vector: %s'
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-e">
                binary_x
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb628066991141-6">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'Length of binray vector: %d'
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-v">
                K
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0024 seconds] -->
       <p>
       </p>
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb62e128605254" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb62e128605254-1">
               1
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb62e128605254-1">
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <p>
       </p>
       <p>
        特征值做dummies处理后，得到的二值化的特征：
       </p>
       <p>
        <img alt="特征值做dummies处理后得到的二值化特征" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage59.png"/>
       </p>
       <p>
        时间戳转为categorical feature
       </p>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb633501234163" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          def extract_datetime(ts):
    import datetime
    return datetime.datetime.fromtimestamp(ts)
timestamps = rating_data.map(lambda fields:int(fields[3]))
hour_of_day = timestamps.map(lambda ts: extract_datetime(ts).hour)
hour_of_day.take(5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb633501234163-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb633501234163-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb633501234163-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb633501234163-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb633501234163-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb633501234163-6">
               6
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb633501234163-1">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                extract_datetime
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                ts
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb633501234163-2">
               <span class="crayon-h">
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                datetime
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb633501234163-3">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                datetime
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                datetime
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                fromtimestamp
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                ts
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb633501234163-4">
               <span class="crayon-v">
                timestamps
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                rating_data
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-k ">
                int
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                3
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb633501234163-5">
               <span class="crayon-v">
                hour_of_day
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                timestamps
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                ts
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                extract_datetime
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                ts
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                hour
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb633501234163-6">
               <span class="crayon-v">
                hour_of_day
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0103 seconds] -->
       <p>
       </p>
       <p>
        <img alt="上述代码的运行结果截图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage60.png"/>
       </p>
       <p class="prettyprint">
        按时间段划分为 morning, lunch, afternoon, evening, night（注意：原书代码中的 'night': [23,7] 是错误的，下面的代码已做修正）:
       </p>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb63a583577555" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
def assign_tod(hr):
    times_of_day = {
        'morning':range(7,12),
        'lunch': range(12,14),
        'afternoon':range(14,18),
        'evening':range(18,23),
        'night': [23,0,1,2,3,4,5,6]
        }
    for k,v in times_of_day.iteritems():
        if hr in v:
            return k
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb63a583577555-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb63a583577555-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb63a583577555-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb63a583577555-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb63a583577555-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb63a583577555-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb63a583577555-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb63a583577555-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb63a583577555-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb63a583577555-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb63a583577555-11">
               11
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb63a583577555-1">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                assign_tod
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                hr
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb63a583577555-2">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                times_of_day
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                {
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb63a583577555-3">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'morning'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-k ">
                range
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                7
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                12
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb63a583577555-4">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'lunch'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                range
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                12
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                14
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb63a583577555-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'afternoon'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-k ">
                range
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                14
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                18
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb63a583577555-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'evening'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-k ">
                range
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                18
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                23
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ,
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb63a583577555-7">
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                'night'
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                23
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                2
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                3
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                4
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-cn">
                6
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb63a583577555-8">
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                }
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb63a583577555-9">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                k
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-i">
                v
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                times_of_day
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                iteritems
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb63a583577555-10">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                hr
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                v
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb63a583577555-11">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                k
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0171 seconds] -->
       <p>
       </p>
       <p class="prettyprint">
        <img alt="上述代码的运行结果截图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage61.png"/>
       </p>
       <p class="prettyprint">
        然后对这些时间段做 dummy（one-hot）编码，例如编码成 [0,0,0,0,1]，做法与前面处理职业特征时类似：
       </p>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb640602319257" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
time_of_day_unique = time_of_day.map(lambda fields:fields).distinct().collect()
time_of_day_unique.sort()
idx = 0
time_of_day_unique_dict = {}
for o in time_of_day_unique:
    time_of_day_unique_dict[o] = idx
    idx +=1
print "Encoding of 'afternoon': %d" %time_of_day_unique_dict['afternoon']
print "Encoding of 'morning': %d" % time_of_day_unique_dict['morning']
print "Encoding of 'lunch': %d" % time_of_day_unique_dict['lunch']
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb640602319257-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb640602319257-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb640602319257-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb640602319257-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb640602319257-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb640602319257-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb640602319257-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb640602319257-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb640602319257-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb640602319257-10">
               10
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb640602319257-1">
               <span class="crayon-v">
                time_of_day_unique
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                time_of_day
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                distinct
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb640602319257-2">
               <span class="crayon-v">
                time_of_day_unique
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                sort
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb640602319257-3">
               <span class="crayon-v">
                idx
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb640602319257-4">
               <span class="crayon-v">
                time_of_day_unique_dict
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                {
               </span>
               <span class="crayon-sy">
                }
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb640602319257-5">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                o
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                time_of_day_unique
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb640602319257-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                time_of_day_unique_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                o
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                idx
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb640602319257-7">
               <span class="crayon-e">
               </span>
               <span class="crayon-v">
                idx
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                +=
               </span>
               <span class="crayon-cn">
                1
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb640602319257-8">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Encoding of 'afternoon': %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-v">
                time_of_day_unique_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'afternoon'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb640602319257-9">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Encoding of 'morning': %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                time_of_day_unique_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'morning'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb640602319257-10">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Encoding of 'lunch': %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                time_of_day_unique_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'lunch'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0132 seconds] -->
       <p>
       </p>
       <p class="prettyprint">
        <img alt="上述代码的运行结果截图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage62.png"/>
       </p>
       <p class="prettyprint">
        文本特征处理基本步骤：
       </p>
       <ul>
        <li>
         Tokenization
        </li>
        <li>
         Stop word removal
        </li>
        <li>
         Stemming
        </li>
        <li>
         Vectorization
        </li>
       </ul>
       <p>
        简单的文本特征提取:
       </p>
       <p>
        1,提取出titles
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb647569377980" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <span class="crayon-mixed-highlight" title="含多种语言">
          </span>
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
def extract_title(raw):
    import re
    grps = re.search("\((\w+)\)",raw)
    if grps:
        return raw[:grps.start()].strip()
    else:
        return raw
raw_titles = movie_fields.map(lambda fields: fields[1])
for raw_title in raw_titles.take(5):
    print extract_title(raw_title)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb647569377980-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb647569377980-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb647569377980-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb647569377980-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb647569377980-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb647569377980-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb647569377980-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb647569377980-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb647569377980-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb647569377980-10">
               10
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb647569377980-1">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                extract_title
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                raw
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb647569377980-2">
               <span class="crayon-h">
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                re
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb647569377980-3">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                grps
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                re
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                search
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                "
                <span class="crayon-sy">
                 \
                </span>
                <span class="crayon-sy">
                 (
                </span>
                <span class="crayon-sy">
                 (
                </span>
                <span class="crayon-sy">
                 \
                </span>
                <span class="crayon-v">
                 w
                </span>
                <span class="crayon-o">
                 +
                </span>
                <span class="crayon-sy">
                 )
                </span>
                \)"
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                raw
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb647569377980-4">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                grps
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb647569377980-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                raw
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-v">
                grps
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                start
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                strip
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb647569377980-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                else
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb647569377980-7">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                raw
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb647569377980-8">
               <span class="crayon-v">
                raw_titles
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_fields
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                fields
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb647569377980-9">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                raw_title
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                raw_titles
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb647569377980-10">
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                extract_title
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                raw_title
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0371 seconds] -->
       <p>
       </p>
       <p>
        2，分词处理（汉语麻烦，还好这里是英语，用空格就可以了）
       </p>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb64d769786341" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
movie_titles = raw_titles.map(lambda m: extract_title(m))
title_terms = movie_titles.map(lambda m:m.split(' '))
print title_terms.take(5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb64d769786341-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb64d769786341-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb64d769786341-3">
               3
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb64d769786341-1">
               <span class="crayon-v">
                movie_titles
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                raw_titles
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                m
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                extract_title
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                m
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb64d769786341-2">
               <span class="crayon-v">
                title_terms
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                movie_titles
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                m
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-v">
                m
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                split
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-s">
                ' '
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb64d769786341-3">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                title_terms
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0207 seconds] -->
       <p>
       </p>
       <p>
        然后将所有 titles 中出现的 word 去重，就可以得到全部 word 的列表：
       </p>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb653577731591" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
all_terms = title_terms.flatMap(lambda x: x).distinct().collect()
idx = 0
all_terms_dict = {}
for term in all_terms:
    all_terms_dict[term] = idx
    idx+=1
    
print "Total number of terms: %d" % len(all_terms_dict)
print "Index of term 'Dead': %d" % all_terms_dict['Dead']
print "Index of term 'Rooms': %d" % all_terms_dict['Rooms']
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb653577731591-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb653577731591-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb653577731591-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb653577731591-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb653577731591-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb653577731591-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb653577731591-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb653577731591-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb653577731591-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb653577731591-10">
               10
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb653577731591-1">
               <span class="crayon-v">
                all_terms
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                title_terms
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                flatMap
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                distinct
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collect
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb653577731591-2">
               <span class="crayon-v">
                idx
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                0
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb653577731591-3">
               <span class="crayon-v">
                all_terms_dict
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-sy">
                {
               </span>
               <span class="crayon-sy">
                }
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb653577731591-4">
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                term
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_terms
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb653577731591-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_terms_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                term
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                idx
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb653577731591-6">
               <span class="crayon-e">
               </span>
               <span class="crayon-v">
                idx
               </span>
               <span class="crayon-o">
                +=
               </span>
               <span class="crayon-cn">
                1
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb653577731591-7">
               <span class="crayon-h">
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb653577731591-8">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Total number of terms: %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                len
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                all_terms_dict
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb653577731591-9">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Index of term 'Dead': %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_terms_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'Dead'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb653577731591-10">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Index of term 'Rooms': %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_terms_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'Rooms'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0213 seconds] -->
       <p>
       </p>
       <p class="prettyprint">
        <img alt="上述代码的输出结果截图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage63.png"/>
       </p>
       <p class="prettyprint">
        上面的可以用Spark内置的zipWithIndex来完成，zipWithIndex的使用：
       </p>
       <p class="prettyprint">
        <img alt="zipWithIndex 的使用说明截图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage64.png"/>
       </p>
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb659636782766" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          all_terms_dict2 = title_terms.flatMap(lambda x:x).distinct().zipWithIndex().collectAsMap()
print "Index of term 'Dead': %d" % all_terms_dict2['Dead']
print "Index of term 'Rooms': %d" % all_terms_dict2['Rooms']
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb659636782766-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb659636782766-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb659636782766-3">
               3
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb659636782766-1">
               <span class="crayon-v">
                all_terms_dict2
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                title_terms
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                flatMap
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                distinct
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                zipWithIndex
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                collectAsMap
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb659636782766-2">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Index of term 'Dead': %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_terms_dict2
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'Dead'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb659636782766-3">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Index of term 'Rooms': %d"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                all_terms_dict2
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-s">
                'Rooms'
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0120 seconds] -->
       <p>
       </p>
       <p>
        结果与上个版本一致。
       </p>
       <p>
        到了这里，我们就要考虑如何把这些数据存储下来、如何使用。如果按前面处理categorical var的方式，做dummies处理后直接存储，显然会浪费太多的空间，因此我们在这里采用压缩稀疏列(csc_matrix)的存储方式。
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb65f035975411" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          def create_vector(terms, term_dict):
    from scipy import sparse as sp
    num_terms = len(term_dict)
    x = sp.csc_matrix((1,num_terms))
    for t in terms:
        if t in term_dict:
            idx = term_dict[t]
            x[0,idx] = 1
    return x
all_terms_bcast = sc.broadcast(all_terms_dict)
term_vectors = title_terms.map(lambda terms: create_vector(terms,all_terms_bcast.value))
term_vectors.take(5)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb65f035975411-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb65f035975411-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb65f035975411-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb65f035975411-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb65f035975411-5">
               5
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb65f035975411-6">
               6
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb65f035975411-7">
               7
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb65f035975411-8">
               8
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb65f035975411-9">
               9
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb65f035975411-10">
               10
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb65f035975411-11">
               11
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb65f035975411-12">
               12
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb65f035975411-1">
               <span class="crayon-r">
                def
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_vector
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                terms
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                term_dict
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb65f035975411-2">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                scipy
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                sparse
               </span>
               <span class="crayon-st">
                as
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                sp
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb65f035975411-3">
               <span class="crayon-e">
               </span>
               <span class="crayon-v">
                num_terms
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-k ">
                len
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                term_dict
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb65f035975411-4">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sp
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                csc_matrix
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                1
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                num_terms
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb65f035975411-5">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                for
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                t
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                terms
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb65f035975411-6">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                if
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                t
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                in
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                term_dict
               </span>
               <span class="crayon-o">
                :
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb65f035975411-7">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                idx
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                term_dict
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                t
               </span>
               <span class="crayon-sy">
                ]
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb65f035975411-8">
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                x
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-cn">
                0
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                idx
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-cn">
                1
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb65f035975411-9">
               <span class="crayon-h">
               </span>
               <span class="crayon-st">
                return
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                x
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb65f035975411-10">
               <span class="crayon-v">
                all_terms_bcast
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                broadcast
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                all_terms_dict
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb65f035975411-11">
               <span class="crayon-v">
                term_vectors
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                title_terms
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-k ">
                map
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-r">
                lambda
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                terms
               </span>
               <span class="crayon-o">
                :
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                create_vector
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                terms
               </span>
               <span class="crayon-sy">
                ,
               </span>
               <span class="crayon-v">
                all_terms_bcast
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                value
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb65f035975411-12">
               <span class="crayon-v">
                term_vectors
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                take
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-cn">
                5
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0293 seconds] -->
       <p>
       </p>
       <p>
        <img alt="上述代码的运行结果截图" src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage65.png"/>
       </p>
       <h3>
        特征归一化
       </h3>
       <ul>
        <li>
         归一化单一特征
        </li>
        <li>
         归一化特征向量
        </li>
       </ul>
       <p>
        使用MLlib来做特征归一化
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb666193829983" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          from pyspark.mllib.feature import Normalizer
normlizer = Normalizer()
vector = sc.parallelize([X])
normalized_x_mllib = normlizer.transform(vector).first().toArray()
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb666193829983-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb666193829983-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb666193829983-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb666193829983-4">
               4
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb666193829983-1">
               <span class="crayon-st">
                from
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                pyspark
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                mllib
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                feature
               </span>
               <span class="crayon-r">
                import
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                Normalizer
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb666193829983-2">
               <span class="crayon-v">
                normlizer
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                Normalizer
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb666193829983-3">
               <span class="crayon-v">
                vector
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                sc
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                parallelize
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                [
               </span>
               <span class="crayon-v">
                X
               </span>
               <span class="crayon-sy">
                ]
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb666193829983-4">
               <span class="crayon-v">
                normalized_x_mllib
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                =
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                normlizer
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                transform
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                vector
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                first
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                toArray
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0118 seconds] -->
       <p>
       </p>
       <!-- Crayon Syntax Highlighter v_2.7.2_beta -->
       <div class="crayon-syntax crayon-theme-classic crayon-font-monaco crayon-os-pc print-yes notranslate" data-settings=" minimize scroll-mouseover" id="crayon-57686696bb66c055614708" style=" margin-top: 12px; margin-bottom: 12px; font-size: 12px !important; line-height: 15px !important;">
        <div class="crayon-toolbar" data-settings=" mouseover overlay hide delay" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
         <span class="crayon-title">
         </span>
         <div class="crayon-tools" style="font-size: 12px !important;height: 18px !important; line-height: 18px !important;">
          <div class="crayon-button crayon-nums-button" title="切换是否显示行编号">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-plain-button" title="纯文本显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-wrap-button" title="切换自动换行">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-expand-button" title="点击展开代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-copy-button" title="复制代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <div class="crayon-button crayon-popup-button" title="在新窗口中显示代码">
           <div class="crayon-button-icon">
           </div>
          </div>
          <span class="crayon-language">
           Python
          </span>
         </div>
        </div>
        <div class="crayon-info" style="min-height: 16.8px !important; line-height: 16.8px !important;">
        </div>
        <div class="crayon-plain-wrap">
         <textarea class="crayon-plain print-no" data-settings="dblclick" readonly="" style="-moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4; font-size: 12px !important; line-height: 15px !important;" wrap="soft">
          print "x:\n%s" % X
print "2-Norm of x: %2.4f" % norm_x_2
print "Normalized x:\n%s" % normalized_x
print "Normalized x MLlib:\n%s" % normalized_x_mllib
print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib)
         </textarea>
        </div>
        <div class="crayon-main" style="">
         <table class="crayon-table">
          <tbody>
           <tr class="crayon-row">
            <td class="crayon-nums " data-settings="show">
             <div class="crayon-nums-content" style="font-size: 12px !important; line-height: 15px !important;">
              <div class="crayon-num" data-line="crayon-57686696bb66c055614708-1">
               1
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb66c055614708-2">
               2
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb66c055614708-3">
               3
              </div>
              <div class="crayon-num crayon-striped-num" data-line="crayon-57686696bb66c055614708-4">
               4
              </div>
              <div class="crayon-num" data-line="crayon-57686696bb66c055614708-5">
               5
              </div>
             </div>
            </td>
            <td class="crayon-code">
             <div class="crayon-pre" style="font-size: 12px !important; line-height: 15px !important; -moz-tab-size:4; -o-tab-size:4; -webkit-tab-size:4; tab-size:4;">
              <div class="crayon-line" id="crayon-57686696bb66c055614708-1">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "x:\n%s"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-i">
                X
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb66c055614708-2">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "2-Norm of x: %2.4f"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                norm_x_2
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb66c055614708-3">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Normalized x:\n%s"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                normalized_x
               </span>
              </div>
              <div class="crayon-line crayon-striped-line" id="crayon-57686696bb66c055614708-4">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "Normalized x MLlib:\n%s"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-e">
                normalized_x_mllib
               </span>
              </div>
              <div class="crayon-line" id="crayon-57686696bb66c055614708-5">
               <span class="crayon-k ">
                print
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-s">
                "2-Norm of normalized_x_mllib: %2.4f"
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-o">
                %
               </span>
               <span class="crayon-h">
               </span>
               <span class="crayon-v">
                np
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-v">
                linalg
               </span>
               <span class="crayon-sy">
                .
               </span>
               <span class="crayon-e">
                norm
               </span>
               <span class="crayon-sy">
                (
               </span>
               <span class="crayon-v">
                normalized_x_mllib
               </span>
               <span class="crayon-sy">
                )
               </span>
              </div>
             </div>
            </td>
           </tr>
          </tbody>
         </table>
        </div>
       </div>
       <!-- [Format Time: 0.0167 seconds] -->
       <p>
       </p>
       <p class="prettyprint">
        <img src="http://blogburness1-wordpress.stor.sinaapp.com/uploads/2015/12/NewImage66.png"/>
       </p>
       <p class="prettyprint">
        总结：spark支持多种语言，如scala,java, python，可以使用相应的包来进行特征处理，例如python下scikit-learn,gensim,scikit-image,matplotlib，notebook文件在
        <a href="https://github.com/burness/ML_Spark">
         github上
        </a>
       </p>
      </div>
      <div>
       <strong>
        注：转载文章均来自于公开网络，仅供学习使用，不会用于任何商业用途，如果侵犯到原作者的权益，请您与我们联系删除或者授权事宜，联系邮箱：contact@dataunion.org。转载数盟网站文章请注明原文章作者，否则产生的任何版权纠纷与数盟无关。
       </strong>
      </div>
      <!--content_text-->
      <div class="fenxian">
       <!-- JiaThis Button BEGIN -->
       <div class="jiathis_style_32x32">
        <p class="jiathis_button_weixin">
        </p>
        <p class="jiathis_button_tsina">
        </p>
        <p class="jiathis_button_qzone">
        </p>
        <p class="jiathis_button_cqq">
        </p>
        <p class="jiathis_button_tumblr">
        </p>
        <a class="jiathis jiathis_txt jtico jtico_jiathis" href="http://www.jiathis.com/share" target="_blank">
        </a>
        <p class="jiathis_counter_style">
        </p>
       </div>
       <!-- JiaThis Button END -->
      </div>
     </article>
     <!--content-->
     <!--相关文章-->
     <div class="xianguan">
      <div class="xianguantitle">
       相关文章！
      </div>
      <ul class="pic">
       <li>
        <a href="http://dataunion.org/24678.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/06/20140917125452915416-216x200.jpg"/>
        </a>
        <a class="link" href="http://dataunion.org/24678.html" rel="bookmark" title="python3中的正则模块">
         python3中的正则模块
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/24675.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/06/t015b337bd75d9ef893-161x200.jpg"/>
        </a>
        <a class="link" href="http://dataunion.org/24675.html" rel="bookmark" title="注释是恶魔，请不要再写一行注释">
         注释是恶魔，请不要再写一行注释
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/24660.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/06/u16130037972892789947fm21gp0-300x157.jpg"/>
        </a>
        <a class="link" href="http://dataunion.org/24660.html" rel="bookmark" title="如何从Github上轻松安装R包">
         如何从Github上轻松安装R包
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/24654.html">
         <img src="http://dataunion.org/wp-content/uploads/2016/06/df53fac99fc53ba5a90666abcca25e6d_b-267x200.png"/>
        </a>
        <a class="link" href="http://dataunion.org/24654.html" rel="bookmark" title="简单形象又有趣地说说强大的神经网络">
         简单形象又有趣地说说强大的神经网络
        </a>
       </li>
      </ul>
     </div>
     <!--相关文章-->
     <div class="comment" id="comments">
      <!-- You can start editing here. -->
      <!-- If comments are open, but there are no comments. -->
      <div class="title">
       期待你一针见血的评论，Come on！
      </div>
      <div id="respond">
       <p>
        不用想啦，马上
        <a href="http://dataunion.org/wp-login.php?redirect_to=http%3A%2F%2Fdataunion.org%2F22496.html">
         "登录"
        </a>
        发表自己的想法.
       </p>
      </div>
     </div>
     <!-- .nav-single -->
    </div>
    <!--Container End-->
    <aside id="sitebar">
     <div class="sitebar_list2">
      <div class="wptag">
       <span class="tagtitle">
        热门标签+
       </span>
       <div class="tagg">
        <ul class="menu" id="menu-%e5%8f%8b%e6%83%85%e9%93%be%e6%8e%a5">
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-1605" id="menu-item-1605">
          <a href="http://taidizh.com/">
           泰迪智慧
          </a>
         </li>
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-20884" id="menu-item-20884">
          <a href="http://www.transwarp.cn/">
           星环科技
          </a>
         </li>
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-3538" id="menu-item-3538">
          <a href="http://datall.org/">
           珈和遥感
          </a>
         </li>
         <li class="menu-item menu-item-type-custom menu-item-object-custom menu-item-20888" id="menu-item-20888">
          <a href="http://www.chinahadoop.cn/">
           小象学院
          </a>
         </li>
        </ul>
       </div>
      </div>
     </div>
     <div class="sitebar_list">
      <div class="textwidget">
       <div align="center">
        <a href="http://study.163.com/course/courseMain.htm?courseId=991022" target="_blank">
         <img src="http://dataunion.org/wp-content/uploads/2016/03/dv.jpg"/>
        </a>
       </div>
      </div>
     </div>
     <div class="sitebar_list">
      <h4 class="sitebar_title">
       文章分类
      </h4>
      <div class="tagcloud">
       <a class="tag-link-44" href="http://dataunion.org/category/industry/demo" style="font-size: 10.204724409449pt;" title="4个话题">
        Demo展示
       </a>
       <a class="tag-link-31" href="http://dataunion.org/category/experts" style="font-size: 15.826771653543pt;" title="52个话题">
        专家团队
       </a>
       <a class="tag-link-870" href="http://dataunion.org/category/tech/ai" style="font-size: 19.795275590551pt;" title="273个话题">
        人工智能
       </a>
       <a class="tag-link-488" href="http://dataunion.org/category/%e5%8a%a0%e5%85%a5%e6%95%b0%e7%9b%9f" style="font-size: 8pt;" title="1个话题">
        加入数盟
       </a>
       <a class="tag-link-869" href="http://dataunion.org/category/tech/viz" style="font-size: 17.204724409449pt;" title="93个话题">
        可视化
       </a>
       <a class="tag-link-30" href="http://dataunion.org/category/partners" style="font-size: 10.645669291339pt;" title="5个话题">
        合作伙伴
       </a>
       <a class="tag-link-889" href="http://dataunion.org/category/parterc" style="font-size: 11.582677165354pt;" title="8个话题">
        合作会议
       </a>
       <a class="tag-link-104" href="http://dataunion.org/category/books" style="font-size: 12.96062992126pt;" title="15个话题">
        图书
       </a>
       <a class="tag-link-220" href="http://dataunion.org/category/tech/base" style="font-size: 19.850393700787pt;" title="281个话题">
        基础架构
       </a>
       <a class="tag-link-219" href="http://dataunion.org/category/tech/analysis" style="font-size: 19.409448818898pt;" title="232个话题">
        数据分析
       </a>
       <a class="tag-link-887" href="http://dataunion.org/category/tech/dm" style="font-size: 13.291338582677pt;" title="17个话题">
        数据挖掘
       </a>
       <a class="tag-link-34" href="http://dataunion.org/category/tech" style="font-size: 20.732283464567pt;" title="404个话题">
        文章
       </a>
       <a class="tag-link-1" href="http://dataunion.org/category/uncategorized" style="font-size: 22pt;" title="693个话题">
        未分类
       </a>
       <a class="tag-link-4" href="http://dataunion.org/category/events" style="font-size: 14.503937007874pt;" title="29个话题">
        活动
       </a>
       <a class="tag-link-890" href="http://dataunion.org/category/tech/%e6%b7%b1%e5%ba%a6%e5%ad%a6%e4%b9%a0" style="font-size: 10.204724409449pt;" title="4个话题">
        深度学习
       </a>
       <a class="tag-link-221" href="http://dataunion.org/category/tech/devl" style="font-size: 18.968503937008pt;" title="193个话题">
        编程语言
       </a>
       <a class="tag-link-888" href="http://dataunion.org/category/career" style="font-size: 15.661417322835pt;" title="48个话题">
        职业规划
       </a>
       <a class="tag-link-5" href="http://dataunion.org/category/jobs" style="font-size: 14.11811023622pt;" title="25个话题">
        职位
       </a>
       <a class="tag-link-871" href="http://dataunion.org/category/industry" style="font-size: 15.716535433071pt;" title="49个话题">
        行业
       </a>
       <a class="tag-link-613" href="http://dataunion.org/category/industry/case" style="font-size: 16.984251968504pt;" title="84个话题">
        行业应用
       </a>
       <a class="tag-link-885" href="http://dataunion.org/category/industry/news" style="font-size: 17.425196850394pt;" title="102个话题">
        行业资讯
       </a>
       <a class="tag-link-10" href="http://dataunion.org/category/training" style="font-size: 14.228346456693pt;" title="26个话题">
        课程
       </a>
       <a class="tag-link-16" href="http://dataunion.org/category/sources" style="font-size: 15.661417322835pt;" title="48个话题">
        资源
       </a>
      </div>
     </div>
     <div class="sitebar_list">
      <h4 class="sitebar_title">
       功能
      </h4>
      <ul>
       <li>
        <a href="http://dataunion.org/wp-login.php?action=register">
         注册
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/wp-login.php">
         登录
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/feed">
         文章
         <abbr title="Really Simple Syndication">
          RSS
         </abbr>
        </a>
       </li>
       <li>
        <a href="http://dataunion.org/comments/feed">
         评论
         <abbr title="Really Simple Syndication">
          RSS
         </abbr>
        </a>
       </li>
       <li>
        <a href="https://cn.wordpress.org/" title="基于WordPress，一个优美、先进的个人信息发布平台。">
         WordPress.org
        </a>
       </li>
      </ul>
     </div>
    </aside>
    <div class="clear">
    </div>
   </div>
   <!--main-->
   <footer id="dibu">
    <div class="about">
     <div class="right">
      <ul class="menu" id="menu-%e5%ba%95%e9%83%a8%e8%8f%9c%e5%8d%95">
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-18024" id="menu-item-18024">
        <a href="http://dataunion.org/category/partners">
         合作伙伴
        </a>
       </li>
       <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-20881" id="menu-item-20881">
        <a href="http://dataunion.org/contribute">
         文章投稿
        </a>
       </li>
       <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-20872" id="menu-item-20872">
        <a href="http://dataunion.org/category/%e5%8a%a0%e5%85%a5%e6%95%b0%e7%9b%9f">
         加入数盟
        </a>
       </li>
       <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-22441" id="menu-item-22441">
        <a href="http://dataunion.org/f-links">
         友情链接
        </a>
       </li>
       <li class="menu-item menu-item-type-post_type menu-item-object-page menu-item-20874" id="menu-item-20874">
        <a href="http://dataunion.org/aboutus">
         关于数盟
        </a>
       </li>
      </ul>
      <p class="banquan">
       数盟社区，做最棒的数据科学社区
      </p>
     </div>
     <div class="left">
      <ul class="bottomlist">
       <li>
        <a href="http://weibo.com/DataScientistUnion" target="_blank" title="">
         <img src="http://dataunion.org/wp-content/themes/yzipi/images/weibo.png"/>
        </a>
       </li>
       <li>
        <a class="cd-popup-trigger" href="http://dataunion.org/22496.html#0">
         <img src="http://dataunion.org/wp-content/themes/yzipi/images/weixin.png"/>
        </a>
       </li>
      </ul>
      <div class="cd-popup">
       <div class="cd-popup-container">
        <h1>
         扫描二维码,加微信公众号
        </h1>
        <img src="http://dataunion.org/wp-content/themes/yzipi/images/2014-12-06-1515289049.png"/>
        <a class="cd-popup-close" href="http://dataunion.org/22496.html">
        </a>
       </div>
       <!-- cd-popup-container -->
      </div>
      <!-- cd-popup -->
     </div>
    </div>
    <!--about-->
    <div class="bottom">
     <a href="http://dataunion.org/">
      数盟社区
     </a>
     <a href="http://www.miitbeian.gov.cn/" rel="external nofollow" target="_blank">
      京ICP备14026740号
     </a>
     联系我们：
     <a href="mailto:contact@dataunion.org" target="_blank">
      contact@dataunion.org
     </a>
     <div class="tongji">
     </div>
     <!--bottom-->
     <div class="scroll" id="scroll" style="display:none;">
      ︿
     </div>
    </div>
   </footer>
   <!--dibu-->
  </div>
 </body>
</html>